This notebook is an exploratory analysis of the data for the "Predicting Project Success" project.
Members of the team:
Imports // Helper Functions // Read Dataset
# Notebook helper: inject a JS toggle button so readers can show/hide all raw
# code cells (useful when sharing the rendered notebook with non-programmers).
from IPython.display import HTML
HTML('''<script>
code_show=true;
function code_toggle() {
if (code_show){
$('div.input').show();
} else {
$('div.input').hide();
}
code_show = !code_show
}
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')
# Standard library
import os
import math
from datetime import datetime, timedelta
# Core data / plotting stack
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm
# BOKEH IMPORTS
from bokeh.layouts import gridplot
from bokeh.plotting import figure, output_notebook, show
from bokeh.models import HoverTool, ColumnDataSource
output_notebook()  # render bokeh plots inline in the notebook
# Plotly imports
from plotly import express as px
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode()  # render plotly plots inline (offline mode)
import missingno as msno  # missing-data visualization (msno.matrix)
# Avoid scientific notation output in Pandas
# pd.set_option('display.float_format', lambda x: '%.2f' % x)
pd.options.display.float_format = '{:,.2f}'.format
import logging
# Improve resolution of output graphics
%config InlineBackend.figure_format ='retina'
def count_chart(d, count_this, by_this, function='unique'):
    """Plot a Bokeh bar (quad) chart of counts per group.

    Args:
        d: DataFrame to aggregate.
        count_this: column whose per-group count is plotted on the y axis.
        by_this: column to group by (x-axis categories).
        function: 'unique' for nunique per group, 'count' for raw row counts.

    Raises:
        ValueError: if `function` is not 'unique' or 'count'.
    """
    if function == 'unique':
        grouping = d.groupby(by_this).nunique()
    elif function == 'count':
        grouping = d.groupby(by_this).count()
    else:
        # BUG FIX: previously an unknown `function` fell through and raised a
        # confusing NameError on `grouping`; fail fast with a clear message.
        raise ValueError(f"function must be 'unique' or 'count', got {function!r}")
    # left/right edges give each category a unit-width bar
    grouping['left'] = range(0, len(grouping))
    grouping['right'] = grouping['left'].values + 1
    tools = [HoverTool(
        tooltips=[
            (f"{by_this}", f"@{by_this}"),
            (f"Num {count_this}'s", f"@{count_this}"),
        ]
    )]
    p = figure(title=f"Count of Number of {count_this}'s by {by_this}", tools=tools,
               plot_height=500, plot_width=700, background_fill_color="#fafafa",
               x_range=list(grouping.index))
    p.quad(source=ColumnDataSource(grouping[[count_this, 'left', 'right']]),
           top=count_this, bottom=0, left='left', right='right',
           line_color="white", alpha=0.7)
    p.y_range.start = 0
    p.xaxis.axis_label = f'{by_this}'
    p.yaxis.axis_label = f"Num {count_this}'s"
    p.grid.grid_line_color = "white"
    # rotate x labels to avoid overlap on long category names
    p.xaxis.major_label_orientation = math.pi / 3
    show(p)
def hist_chart(d, count_this, for_each):
    """Bokeh histogram: how many `for_each` groups have N unique `count_this` values.

    Args:
        d: DataFrame to aggregate.
        count_this: column counted (nunique) within each group.
        for_each: grouping column (e.g. 'PID').
    """
    # per-group unique counts, e.g. number of report dates per project
    num = d.groupby(by=for_each).nunique()[count_this]
    # unit-width bin edges from 1 up to the max observed count
    # NOTE(review): assumes max(num) >= 2; with a single count value `_left`
    # is empty and np.histogram(bins=0) would fail — confirm inputs.
    _left = [x for x in range(1, max(num))]
    _right = list(np.array(_left) + 1)
    hist, _ = np.histogram(num, bins=len(_left))
    tools = [HoverTool(
        tooltips=[
            (f"This many '{for_each}'", "@top"),
            (f"Have this many '{count_this}'", "@left"),
        ]
    )]
    p = figure(title=f"Count of Number of '{count_this}'", tools=tools,
               plot_height=300, plot_width=700, background_fill_color="#fafafa")
    p.quad(top=hist, bottom=0, left=_left, right=_right, line_color="white", alpha=0.7)  #, fill_color="navy", )
    p.y_range.start = 0
    p.xaxis.axis_label = f"Num '{count_this}'"
    p.yaxis.axis_label = f"Num '{for_each}'"
    p.grid.grid_line_color = "white"
    show(p)
def plot_hist_comps(df, metric_1, metric_2, y_log=False, bins=20):
    """Plots side-by-side histograms for comparison with log yscale option.

    Args:
        df: DataFrame containing the metric columns.
        metric_1, metric_2: column names to histogram side by side.
        y_log: if True, use a log-scaled y axis on both subplots.
        bins: number of histogram bins per subplot.
    """
    metrics_list = [metric_1, metric_2]
    # 'Original_Budget' -> 'ORIGINAL BUDGET' for display
    metrics_str = [
        metric.replace('_', ' ').upper() for metric in metrics_list
    ]
    fig, axes = plt.subplots(1, 2, sharey=True, figsize=(12, 4))
    plt.suptitle(
        'Projects by {} and {}'.format(*metrics_str),
        fontsize=18
    )
    # avoid shadowing the axes array with the loop variable
    for i, (axis, metric_col, metric_name) in enumerate(zip(axes, metrics_list, metrics_str)):
        axis.hist(df[metric_col], bins=bins, alpha=0.7)
        axis.set_xlabel(metric_name, fontsize=14)
        # BUG FIX: grid(':') passed ':' as the `visible` flag, so the dotted
        # linestyle was never applied; pass it as `linestyle` explicitly.
        axis.grid(linestyle=':', alpha=0.4)
        if i == 0:
            axis.set_ylabel('frequency', fontsize=12)
        if y_log:
            axis.set_yscale('log')
            if i == 0:
                axis.set_ylabel('frequency (log scale)', fontsize=12)
    # leave headroom for the suptitle
    plt.tight_layout(rect=[0, 0.03, 1, .94])
    plt.show()
def plot_change_trend(trend_data, pid_data, pid):
    """Plots 4 subplots showing project budget and duration forecast change trend.

    Args:
        trend_data: per-report-date change records (expects columns 'PID',
            'Date_Reported_As_Of', 'Budget_Forecast', 'Latest_Budget_Changes',
            'Latest_Schedule_Changes').
        pid_data: one-record-per-project frame (expects 'PID', 'Project_Name',
            'Category', 'Borough', 'Original_Duration').
        pid: project id (PID) to plot.
    """
    # sets default for converting datetimes in matplotlib
    from pandas.plotting import register_matplotlib_converters
    from matplotlib.dates import YearLocator, DateFormatter
    register_matplotlib_converters()
    years = YearLocator()
    years_fmt = DateFormatter('%Y')

    def set_date_axis(ax, years, years_fmt):
        # year-granularity ticks/labels on the shared date x-axis
        ax.xaxis.set_major_locator(years)
        ax.xaxis.set_major_formatter(years_fmt)

    fig, ax = plt.subplots(2,2, sharex=True, figsize=(12,6))
    pid_record = pid_data.loc[pid_data['PID']==pid]
    pid_changes = trend_data.loc[trend_data['PID']==pid]
    # running duration forecast = original duration + cumulative changes
    project_duration = pid_record['Original_Duration'].values[0] + \
        np.cumsum(pid_changes['Latest_Schedule_Changes'].values)
    plt.suptitle(
        'PID {}: {}\nCategory: {}\nBorough: {}\ninitial duration: {:,.0f} days'.format(
            pid,
            pid_record['Project_Name'].values[0][:72],
            pid_record['Category'].values[0],
            pid_record['Borough'].values[0],
            pid_record['Original_Duration'].values[0]
        ), fontsize=16
    )
    # plot budget forecast
    # NOTE(review): divides by 1e7 but the label says "millions" — confirm
    # whether 1e6 was intended.
    ax[0,0].plot(
        pid_changes['Date_Reported_As_Of'], pid_changes['Budget_Forecast']/1e7, 'ko-'
    )
    ax[0,0].set_title('Total budget forecast')
    ax[0,0].set_ylabel('USD (millions)')
    # plot budget forecast percent change (relative to the prior forecast)
    ax[1,0].plot(
        pid_changes['Date_Reported_As_Of'],
        ((pid_changes['Latest_Budget_Changes'])/
         (pid_changes['Budget_Forecast']-pid_changes['Latest_Budget_Changes']))*100,
        'ko-'
    )
    ax[1,0].axhline(0, color='k', linestyle=':')
    ax[1,0].set_title('Percentage budget change')
    ax[1,0].set_ylabel('percent change')
    ax[1,0].set_xlabel('project change date')
    # plot duration forecast
    ax[0,1].plot(
        pid_changes['Date_Reported_As_Of'], project_duration/1e3, 'ko-'
    )
    ax[0,1].set_title('Total forecasted project duration')
    ax[0,1].set_ylabel('days (thousands)')
    # plot duration change (percent, relative to the prior duration forecast)
    ax[1,1].plot(
        pid_changes['Date_Reported_As_Of'],
        (pid_changes['Latest_Schedule_Changes'] /
         (project_duration - pid_changes['Latest_Schedule_Changes']))*100,
        'ko-'
    )
    ax[1,1].axhline(0, color='k', linestyle=':')
    ax[1,1].set_title('Percentage duration change')
    ax[1,1].set_ylabel('percent change')
    ax[1,1].set_xlabel('project change date')
    # shared grid + date-axis formatting for all four panels
    # NOTE(review): grid(':') passes ':' as the `visible` flag, not linestyle —
    # linestyle=':' was likely intended.
    for a in ax.flat:
        a.grid(':', alpha=0.4)
        set_date_axis(a, years, years_fmt)
    plt.tight_layout(rect=[0, 0.03, 1, .81])
    plt.show()
file_path = '../data/Capital_Projects.csv'
# Sanity-check the source path before loading.
if os.path.isfile(file_path):
    print("OK - path points to file.")
else:
    print("ERROR - check the 'file_path' and ensure it points to the source file.")
data = pd.read_csv(file_path)
# Basic size summary: total records and distinct projects.
print(f"Number of dataset records: {len(data)}")
print(f"Number of unique projects in dataset: {len(data['PID'].unique())}")
# Standardize column headers: spaces become underscores.
data.rename(columns={col: col.replace(' ', '_') for col in data.columns}, inplace=True)
# Parse the date columns into datetime dtype.
for _date_col in ('Date_Reported_As_Of', 'Design_Start', 'Forecast_Completion'):
    data[_date_col] = pd.to_datetime(data[_date_col])
# Order records chronologically within each project.
data = data.sort_values(by=['PID', 'Date_Reported_As_Of'])
To find "weird" partial missing-data patterns, we followed these steps:
# A "weird" pattern = a column that is missing for only SOME of a project's
# rows (whole-column-missing is expected; partial gaps are suspicious).
weird_missing_pids = []
data_missing = data.drop(columns=["Latest_Budget_Changes", "Latest_Schedule_Changes"])
for pid, group in data_missing.groupby("PID"):
    cols_with_nan = group.columns[group.isnull().any()]
    partially_missing = [
        col for col in cols_with_nan if not group[col].isnull().all()
    ]
    if partially_missing:
        weird_missing_pids.append(pid)
logging.warning(f"Found weird missing pattern for {len(weird_missing_pids)} projects")
## Print all weird missing pids (disabled: too many plots)
# for pid, df in data_missing.query("PID in @weird_missing_pids").groupby("PID"):
#     print(f"Missing patern for project {pid}")
#     msno.matrix(df)
#     plt.show()
# Show a hand-picked sample of projects with the weird pattern.
missing_example_pids = [470, 546, 534, 553, 959]
for pid, df in data_missing.query("PID in @missing_example_pids").groupby("PID"):
    print(f"Missing patern for project {pid}")
    msno.matrix(df)
    plt.show()
# rename phases with a numeric prefix to indicate their sequence
rename_phases = {
'IT':'0-IT',
'Scoping/Planning':'1-Scoping/Planning',
'Design':'2-Design',
'Construction Procurement':'3-Construction Procurement',
'Construction':'4-Construction',
'Close-Out':'5-Close-Out'}
# NOTE(review): Series.map turns any phase value NOT in this dict into NaN
# (unlike .replace, which would keep it). Later cells fillna these with
# 'not_specified', which appears intentional — confirm no phase is lost.
data.Current_Phase = data.Current_Phase.map(rename_phases)
# Create an original budget column.
# The earliest record of each project is the source of the original budget:
#   Original_Budget = Budget_Forecast - Latest_Budget_Changes
# Missing 'Latest_Budget_Changes' is treated as 0 (best info available).
data['Latest_Budget_Changes'] = data['Latest_Budget_Changes'].fillna(0)
# Build a PID -> original budget mapping from each project's first record.
orig_budgets_dict = {}
for project_id, group in data.groupby('PID'):
    first_row = group.sort_values(by='Date_Reported_As_Of').iloc[0]
    orig_budgets_dict[project_id] = (
        first_row['Budget_Forecast'] - first_row['Latest_Budget_Changes']
    )
# Report projects whose original budget could not be computed (NaN).
for project_id, budget in orig_budgets_dict.items():
    if math.isnan(budget):
        print(project_id, budget)
# Several projects need manual corrections because of poor data quality.
orig_budgets_dict.update({
    569: 28932000.00,
    595: 0,
    598: 0,
    600: 0,
    696: 0,
    697: 0,
    707: 0,
    800: 0,
    810: 0,
    870: 0,
    898: 27500000,
    899: 27500000,
    900: 27400000,
    901: 27500000,
    932: 0,
    934: 0,
})
# apply the original budget values to each project
def update_budget(s):
    """Return the precomputed original budget for the row's project (PID)."""
    return orig_budgets_dict.get(s.PID)

# PERF: Series.map is a vectorized dict lookup — much faster than the
# original row-wise DataFrame.apply(update_budget, axis=1), with identical
# resulting values (missing PIDs become NaN either way).
data.insert(10, 'Original_Budget', data['PID'].map(orig_budgets_dict))
# Check that no values are NaN
data[data['Original_Budget'].isna()]
# Where text descriptions are missing, substitute the 'not_specified' marker.
for text_col in ('Description', 'Borough', 'Current_Phase', 'Client_Agency'):
    data[text_col] = data[text_col].fillna('not_specified')
# Several projects have trailing records carrying no information — remove them.
uninformative = (
    (data['Current_Phase'] == 'not_specified')
    & data['Design_Start'].isna()
    & data['Latest_Schedule_Changes'].isna()
    & data['Budget_Forecast'].isna()
    & (data['Latest_Budget_Changes'] == 0)
)
data = data.drop(index=data[uninformative].index)
# NaN values per column
data.isna().sum()
# Fix entries where Total_Budget_Changes is nan.
# Per project: if there are no NaNs, the last reported total is authoritative;
# if ALL values are NaN, reconstruct it as the sum of 'Latest_Budget_Changes';
# otherwise take the max reported total.
ttl_budget_changes_dict = {}
for PID in data['PID'].unique():
    df = data[data['PID'] == PID].sort_values(by='Date_Reported_As_Of')
    # BUG FIX: the original tested `len(series.isna()) == 0`, which is the
    # row count and never 0, so the "no NaN" branch was unreachable and the
    # max() fallback was always used for NaN-free projects.
    if df['Total_Budget_Changes'].isna().sum() == 0:
        ttl_budget_changes_dict[PID] = df.iloc[-1]['Total_Budget_Changes']
    elif df['Total_Budget_Changes'].isna().all():
        ttl_budget_changes_dict[PID] = df['Latest_Budget_Changes'].sum()
    else:
        ttl_budget_changes_dict[PID] = df['Total_Budget_Changes'].max()

# apply the total budget changes values to each project
def update_changes(s):
    """Return the reconstructed total budget change for the row's PID."""
    return ttl_budget_changes_dict.get(s.PID)

data['Total_Budget_Changes'] = data.apply(update_changes, axis=1)
# Update/Correct Latest_Schedule_Changes that have nan values.
# Rules: first entry of a project with NaN -> 0; later NaNs are reconstructed
# from totals or from consecutive Forecast_Completion dates.
latest_schedule_changes_list = []
new_PID = False
prev_row = None
for i, row in data.iterrows():
    # see if this is the first entry for a project
    # (prev_row is None on the very first iteration -> AttributeError -> except)
    try:
        if row.PID != prev_row.PID:
            new_PID = True
    except:
        new_PID = True
    x = row['Latest_Schedule_Changes']
    # if nan and first entry, make 0
    if pd.isna(row['Latest_Schedule_Changes']) and new_PID:
        x = 0
    # if nan and not first row, calculate
    elif pd.isna(row['Latest_Schedule_Changes']) and not new_PID:
        # if forecast_completion is nan, we cannot diff dates; fall back to totals
        if pd.isna(row['Forecast_Completion']) or pd.isna(prev_row['Forecast_Completion']):
            # if the changes are all accounted for, make the entry 0
            if row['Total_Schedule_Changes'] - data[data.PID == row.PID]['Latest_Schedule_Changes'].sum() == 0:
                x = 0
            # if this is the only NA row, we can back into the value
            elif data[data.PID == row.PID]['Latest_Schedule_Changes'].isna().sum() == 1:
                x = row['Total_Schedule_Changes'] - data[data.PID == row.PID]['Latest_Schedule_Changes'].sum()
            # otherwise, can't calculate (x stays NaN)
        else:
            # there is a forecast_completion date on both rows: diff them
            # NOTE(review): this difference is a pandas Timedelta, so the
            # rebuilt column mixes numbers and Timedeltas — later cells call
            # int() on these values; confirm downstream handling.
            x = row['Forecast_Completion'] - prev_row['Forecast_Completion']
    latest_schedule_changes_list.append(x)
    new_PID = False
    prev_row = row.copy()
data['Latest_Schedule_Changes'] = latest_schedule_changes_list
# this project is a lost cause (still NaN after reconstruction)
data[data['Latest_Schedule_Changes'].isna()]
# Now, calculate any missing Forecast_Completion values by rolling forward the
# previous row's forecast plus the latest schedule change.
fc_completion_list = []
new_PID = False
prev_row = None
for i, row in data.iterrows():
    # first entry of a project: no previous row, or the PID changed
    if prev_row is None or row.PID != prev_row.PID:
        new_PID = True
    x = row['Forecast_Completion']
    if pd.isna(x) and not new_PID:
        # back into the date from the previous forecast plus the latest change
        x = prev_row['Forecast_Completion'] + timedelta(int(row['Latest_Schedule_Changes']))
    fc_completion_list.append(x)
    # BUG FIX: the original loop never updated prev_row or reset new_PID, so
    # every row looked like a project's first record and no missing value was
    # ever filled in.
    new_PID = False
    prev_row = row.copy()
data['Forecast_Completion'] = fc_completion_list
# now calculate any missing Total_Schedule_Changes (same rules as the
# Total_Budget_Changes reconstruction above)
ttl_fc_changes_dict = {}
for PID in data['PID'].unique():
    df = data[data['PID'] == PID].sort_values(by='Date_Reported_As_Of')
    # BUG FIX: the original tested `len(series.isna()) == 0` (the row count,
    # never 0), so the "no NaN" branch was unreachable and max() was always
    # used for NaN-free projects.
    if df['Total_Schedule_Changes'].isna().sum() == 0:
        # no NaNs: the last reported total is authoritative
        ttl_fc_changes_dict[PID] = df.iloc[-1]['Total_Schedule_Changes']
    elif df['Total_Schedule_Changes'].isna().all():
        # all NaN: reconstruct as the sum of the per-report changes
        ttl_fc_changes_dict[PID] = df['Latest_Schedule_Changes'].sum()
    else:
        # partially missing: take the max reported total
        ttl_fc_changes_dict[PID] = df['Total_Schedule_Changes'].max()

def update_changes(s):
    """Return the reconstructed total schedule change for the row's PID."""
    return ttl_fc_changes_dict.get(s.PID)

data['Total_Schedule_Changes'] = data.apply(update_changes, axis=1)
# Create an original schedule column: back into the original completion date
# from the first record of each project (best available estimate).
# Drop any pre-existing column so re-running the cell is idempotent;
# errors='ignore' replaces the original bare try/except pass.
data.drop(columns=['Original_Schedule'], inplace=True, errors='ignore')
orig_sched_dict = {}
for PID in data['PID'].unique():
    df = data[data['PID'] == PID]
    first, last = df.iloc[0], df.iloc[-1]
    # BUG FIX / robustness: int(nan) raises ValueError, so only subtract when
    # the first record's change value is usable; otherwise fall through to the
    # last-record fallback (matching the original intent).
    x = pd.NaT
    if not pd.isna(first['Latest_Schedule_Changes']):
        x = first['Forecast_Completion'] - timedelta(days=int(first['Latest_Schedule_Changes']))
    # if the first record couldn't be used, use the last record
    if pd.isna(x):
        x = last['Forecast_Completion'] - timedelta(days=int(last['Total_Schedule_Changes']))
    orig_sched_dict[PID] = x

def update_schedule(s):
    """Return the reconstructed original schedule date for the row's PID."""
    return orig_sched_dict.get(s.PID)

data.insert(14, 'Original_Schedule', data.apply(update_schedule, axis=1))
data.isna().sum()
# Projects still in the 'Scoping/Planning' phase have no scheduled date yet;
# zero-fill any remaining NaNs on those rows.
scoping_idx = data[data['Current_Phase'] == '1-Scoping/Planning'].index
for idx in scoping_idx:
    data.loc[idx] = data.loc[idx].fillna(0)
data.isna().sum()
How many updates do we have for each project?
# Distribution of the number of status updates (report dates) per project.
hist_chart(d=data, count_this='Date_Reported_As_Of', for_each='PID')
How many updates do we have for each project Category?
# Update counts over time by category; second plot also faceted by agency.
px.histogram(data, x="Date_Reported_As_Of", color="Category", barmode="group")
px.histogram(data, x="Date_Reported_As_Of", color="Category", barmode="group", facet_col="Managing_Agency", facet_col_wrap=4)
Distribution of projects by categories
# Number of unique projects per category.
count_chart(d=data, count_this='PID', by_this='Category')
Distribution of projects by Boroughs
# Unique projects per borough / managing agency / client agency / phase.
count_chart(d=data, count_this='PID', by_this='Borough')
count_chart(d=data, count_this='PID', by_this='Managing_Agency')
count_chart(d=data, count_this='PID', by_this='Client_Agency')
count_chart(d=data, count_this='PID', by_this='Current_Phase')
# updates per phase (raw row counts, not unique)
count_chart(d=data, count_this='Date_Reported_As_Of', by_this='Current_Phase', function='count')
data.info()
# identify records with '0' value 'Design_Start', causing datetime conversion errors
zero_start = data['Design_Start'] == 0
drop_PID = list(data.loc[zero_start]['PID'].values)
n_drop_PID_records = len(data.loc[zero_start]['PID'])
n_records = len(data)
# summarize what will be removed
print(
    'The following projects have zero-valued "Design_Start" dates and '
    'will be dropped, resulting in {} total records dropped:\n\n\t{}\n'
    ''.format(n_drop_PID_records, drop_PID)
)
# remove every record belonging to the flagged projects
data = data.loc[~data['PID'].isin(drop_PID)].copy()
print(
    'The resulting dataframe, now contains {:,} records'.format(len(data))
)
# re-establish datetime columns as correct type
date_cols = [
    'Date_Reported_As_Of',
    'Forecast_Completion',
    'Design_Start',
    'Original_Schedule'
]
for date_col in date_cols:
    data[date_col] = pd.to_datetime(data[date_col])
# summary stats for datetime cols before the fix
data[date_cols].describe()
# fix invalid year "2108", assuming it was a keying error
bad_year = data['Forecast_Completion'] == '2108-10-21'
data.loc[bad_year, 'Forecast_Completion'] = pd.Timestamp('2018-10-21')
# summary stats for datetime cols after the fix
data[date_cols].describe()
# verify resulting data types
data.info()
# collapse the dataframe to a single record for each PID
df_pid = data.groupby('PID').agg(
    {
        'Project_Name': 'first',
        'Category': 'first',
        'Borough': 'first',
        'Managing_Agency': 'first',
        'Client_Agency': 'first',
        'Date_Reported_As_Of': 'count',
        'Original_Budget': 'first',
        # FIX: string aggregator aliases instead of the builtin min/max
        # callables — identical result, but avoids the pandas deprecation
        # warning for builtin aggregation functions.
        'Design_Start': 'min',
        'Original_Schedule': 'first',
        'Forecast_Completion': 'last',
        'Total_Schedule_Changes': 'max',
        'Total_Budget_Changes': 'max'
    }
).reset_index()
# rename column for number of changes
df_pid = df_pid.rename(columns={'Date_Reported_As_Of': 'Number_Changes'})
# define original forecasted project duration (in days)
df_pid['Original_Duration'] = (df_pid['Original_Schedule'] - df_pid['Design_Start']).dt.days
# define budget change ratio (relative to the original budget)
df_pid['Budget_Change_Ratio'] = df_pid['Total_Budget_Changes']/df_pid['Original_Budget']
# define schedule change ratio (relative to the original duration)
df_pid['Schedule_Change_Ratio'] = df_pid['Total_Schedule_Changes']/df_pid['Original_Duration']
# Drop projects with missing or zero values for each of the following columns
drop_filter_cols = [
    'Original_Budget',
    'Design_Start',
    'Original_Schedule'
]

# FIX: a def instead of a lambda assignment (PEP 8 E731) — same behavior.
def loc_filter(df, col):
    """Boolean mask: rows where `col` is zero or null."""
    return (df[col] == 0) | (df[col].isnull())

# (PID, Category) pairs for every row flagged by any filter column.
# FIX: a flat comprehension replaces the quadratic sum(list_of_lists, []).
drop_pids = [
    pair
    for col in drop_filter_cols
    for pair in zip(
        df_pid.loc[loc_filter(df_pid, col)]['PID'].values,
        df_pid.loc[loc_filter(df_pid, col)]['Category'].values,
    )
]
# print summary of PIDs to be dropped
print(
    'The following PIDs will be dropped because they have missing or '
    'zero values for the columns:\n\n\t{}\n\nPID\tCategory'.format(drop_filter_cols)
)
for pid in drop_pids:
    print('{}\t{}'.format(*pid))
# drop PIDs from dataframe
for col in drop_filter_cols:
    df_pid = df_pid.copy().loc[~loc_filter(df_pid, col)]
# print summary of remaining PIDs
print(
    '\nThe resulting dataframe contains {} unique projects\n'.format(len(df_pid))
)
# generate scatter matrix with each quantitative metric
scatter_cols = [
    'Number_Changes',
    'Original_Budget',
    'Total_Budget_Changes',
    'Budget_Change_Ratio',
    'Original_Duration',
    'Total_Schedule_Changes',
    'Schedule_Change_Ratio'
]
g = sns.pairplot(df_pid[scatter_cols], plot_kws={'alpha': 0.3, 's': 100})
g.fig.suptitle('Distribution of budget and duration changes by project', fontsize=22, y=1.01)
plt.show();
# side-by-side histograms (log y-scale) for each raw/change/ratio metric pair
plot_hist_comps(df_pid, 'Original_Budget', 'Original_Duration', y_log=True)
plot_hist_comps(df_pid, 'Total_Budget_Changes', 'Total_Schedule_Changes', y_log=True)
plot_hist_comps(df_pid, 'Budget_Change_Ratio', 'Schedule_Change_Ratio', y_log=True)
def identify_edge_pids(df, metric_col, n_pids=2,
                       change_threshold=5, change_col='Number_Changes'):
    """Identifies PIDs with minimum and maximum values for any given metric.

    Only rows with at least `change_threshold` change records are considered.
    Returns a dict with 'min' and 'max' lists of PIDs, each of length
    `n_pids`, both ordered by ascending metric value.
    """
    eligible = df.loc[df[change_col] >= change_threshold]
    ranked_pids = eligible.sort_values(by=metric_col, ascending=True)['PID']
    return {
        'min': list(ranked_pids.iloc[:n_pids]),
        'max': list(ranked_pids.iloc[-n_pids:]),
    }
metric_col_list = [
    'Original_Budget',
    'Original_Duration',
    'Budget_Change_Ratio',
    'Schedule_Change_Ratio',
]
change_threshold = 5
# min/max outlier PIDs for every metric of interest
edge_pids_dict = {}
for metric_col in metric_col_list:
    edge_pids_dict[metric_col] = identify_edge_pids(
        df_pid, metric_col, n_pids=2, change_threshold=change_threshold
    )
print(
    'Projects with {} or more change records, and the largest and '
    'smallest metric values are as follows:\n'.format(change_threshold)
)
for metric_name, edges in edge_pids_dict.items():
    print('{}\n'.format(metric_name))
    for edge_label, pid_list in edges.items():
        print('{}:\t{}'.format(edge_label, pid_list))
    print()
# trend plots for a hand-picked set of interesting projects
pid_plot_list = [
    603, 555, 480, 96, 594, 574, 482
]
for pid in pid_plot_list:
    plot_change_trend(data, df_pid, pid)
    print()
Metrics calculated:
# Derived budget metrics:
#   Budget_To_Date       = original budget plus all changes
#   Budget_Abs_Per_Error = |original - to-date| relative to the to-date budget
#   Budget_Rel_Per_Error = |original - to-date| relative to the original budget
#   Budget_Ratios        = to-date / original
df_pid = df_pid.eval('Budget_To_Date = Original_Budget + Total_Budget_Changes')\
    .eval('Budget_Abs_Per_Error = abs(Original_Budget - Budget_To_Date)/Budget_To_Date')\
    .eval('Budget_Rel_Per_Error = abs(Original_Budget - Budget_To_Date)/Original_Budget')\
    .eval('Budget_Ratios = Budget_To_Date/Original_Budget')
df_pid
# Long-form frame for faceted plotting of the three error/ratio metrics.
# NOTE(review): id_vars is a set, so the id column order in the melted frame
# is not deterministic — consider a sorted list if order matters downstream.
df_pid_melted = df_pid.melt(id_vars=set(df_pid.columns) - set(["Budget_Abs_Per_Error", "Budget_Rel_Per_Error", 'Budget_Ratios']), value_vars=["Budget_Abs_Per_Error", "Budget_Rel_Per_Error", 'Budget_Ratios'], var_name="metric", value_name="metric_value")
df_pid_melted
# Distribution of each metric, colored by category; then faceted by category
# with borough coloring.
px.histogram(df_pid_melted, x="metric_value", facet_col="metric", color="Category", barmode="overlay")
px.histogram(df_pid_melted, x="metric_value", facet_col="metric", facet_row="Category", color="Borough", barmode="overlay", height=5000)
# Per-report-date budget changes over time, one dashed line per project,
# colored by category; range slider + quick-range buttons on the x axis.
fig = px.line(data.sort_values(['Date_Reported_As_Of', "PID"]), x='Date_Reported_As_Of', y="Latest_Budget_Changes", color="Category", line_dash="PID", hover_name="PID", height=1000)
fig.update_xaxes(
    rangeslider_visible=True,
    rangeselector=dict(
        buttons=list([
            dict(count=1, label="1m", step="month", stepmode="backward"),
            dict(count=6, label="6m", step="month", stepmode="backward"),
            dict(count=1, label="YTD", step="year", stepmode="todate"),
            dict(count=1, label="1y", step="year", stepmode="backward"),
            dict(step="all")
        ])
    )
)
fig
# Same series, faceted by borough.
fig = px.line(data.sort_values(['Date_Reported_As_Of', "PID"]), x='Date_Reported_As_Of', y="Latest_Budget_Changes", facet_col="Borough", color="Category", hover_name="PID", line_group="PID", facet_col_wrap=5, height=2000)
fig
Stratify by managing agency
# Same budget-change series, faceted by managing agency...
fig = px.line(data.sort_values(['Date_Reported_As_Of', "PID"]), x='Date_Reported_As_Of', y="Latest_Budget_Changes", facet_col="Managing_Agency", color="Category", hover_name="PID", line_group="PID", facet_col_wrap=5, height=2000)
fig
# ...and by client agency.
fig = px.line(data.sort_values(['Date_Reported_As_Of', "PID"]), x='Date_Reported_As_Of', y="Latest_Budget_Changes", facet_col="Client_Agency", color="Category", hover_name="PID", line_group="PID", facet_col_wrap=5, height=2000)
fig
Poor data quality.
Data is inconsistent in quality: human-entered data will always have inconsistencies, and we will need to work around or with that. Any model will need more data — we will need to source additional project data from other regions and types of projects to increase the variety. There is a lot of missing data, so some data engineering will be necessary before feeding a model; the types of data that need to be created are not yet understood. Any efforts to normalize the data will need to consider normalization within each project as well as within the entire dataset.